source('../../workflow/resources/annotateVariants.R')
sampleName <- 'Pr9'
inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder'
annotations <- annotate_variants(sampleName, inputFolder)
For each cluster (defined by color), we computed a pairwise distance for each mutation pair that indicates how often the two mutations occur in the same private branch of cells from the cluster:
dist(M1, M2) = 0 (for M1 = M2)
dist(M1,M2) = 1 - (%samples where M1 and M2 are both in the same private branch of a cell from the cluster) (elsewise)
A private branch is defined as the path from a leaf to the node just below the LCA of this leaf to another leaf from the same cluster.
This is a generalization of the earlier method to find the top seperating mutations of pairs of leafs. The generalization was necessary to handle the larger clusters that were broken in more than 2 pieces.
clusterName <- "lightcoral"
d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1)
mat <- as.matrix(d)
mat[1:4, 1:4]
## chr19_36639596 chr2_151810586 chr19_21183023 chr18_46704562
## chr19_36639596 0.000000 0.999475 0.872925 0.985375
## chr2_151810586 0.999475 0.000000 0.999225 0.999925
## chr19_21183023 0.872925 0.999225 0.000000 0.981900
## chr18_46704562 0.985375 0.999925 0.981900 0.000000
For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants.
coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1)
coverage$variantName <- rownames(coverage)
head(coverage)
## covScore variantName
## chr19_36639596 0.52 chr19_36639596
## chr2_151810586 0.52 chr2_151810586
## chr19_21183023 0.52 chr19_21183023
## chr18_46704562 0.24 chr18_46704562
## chr6_29112175 0.48 chr6_29112175
## chr19_22316451 0.48 chr19_22316451
coverage <- inner_join(coverage, annotations, by = "variantName")
###Overview To get an overview, we plot the full distance matrix:
library(heatmaply)
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: viridis
## Loading required package: viridisLite
##
## ======================
## Welcome to heatmaply version 1.5.0
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags:
## https://stackoverflow.com/questions/tagged/heatmaply
## ======================
heatmaply(mat)
mat2 <- mat
diag(mat2) <- 1
min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations
selected_muts <- which(min_dist < 0.9) # select those below 0.5 say
mat3 <- mat[selected_muts, selected_muts]
This is what the distance matrix looks like now:
heatmaply(mat3)
To cluster mutations, we create a dendrogram based on the pairwise distances:
mat <- mat3
d_mat <- as.dist(mat)
hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix
par(cex = 0.6)
plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations")
This looks rather complicated and I won’t continue here, as choosing
clusters would be arbitrary.
This is a 3-cell CTC-cluster, so I am expecting up to 3 distinct genotype clusters.
clusterName <- "sandybrown"
d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1)
mat <- as.matrix(d)
mat[1:4, 1:4]
## chr19_36639596 chr2_151810586 chr19_21183023 chr18_46704562
## chr19_36639596 0.000000 0.997500 0.980600 0.986225
## chr2_151810586 0.997500 0.000000 0.976600 0.953225
## chr19_21183023 0.980600 0.976600 0.000000 0.882175
## chr18_46704562 0.986225 0.953225 0.882175 0.000000
For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants.
coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1)
coverage$variantName <- rownames(coverage)
head(coverage)
## covScore variantName
## chr19_36639596 0.52 chr19_36639596
## chr2_151810586 0.52 chr2_151810586
## chr19_21183023 0.52 chr19_21183023
## chr18_46704562 0.24 chr18_46704562
## chr6_29112175 0.48 chr6_29112175
## chr19_22316451 0.48 chr19_22316451
coverage <- inner_join(coverage, annotations, by = "variantName")
###Overview To get an overview, we plot the full distance matrix:
library(heatmaply)
heatmaply(mat)
mat2 <- mat
diag(mat2) <- 1
min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations
selected_muts <- which(min_dist < 0.5) # select those below 0.5 say
mat3 <- mat[selected_muts, selected_muts]
This is what the distance matrix looks like now:
heatmaply(mat3)
To cluster mutations, we create a dendrogram based on the pairwise distances:
mat <- mat3
d_mat <- as.dist(mat)
hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix
par(cex = 0.6)
plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations")
This dataset is complicated and clusterings are highly unstable/depend strongly on the filter. Nonetheless, for the sake of completeness, we provide a clustering here:
We define a cut point to get distinct branches. These should roughly represent the mutations on the private branches in the posterior sampling. The number of clusters can either be the number of distinct branches in the posterior sampling or decided based on the hierarchical clustering.
par(cex = 0.6)
plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations")
abline(h = 0.65, lwd = 2, lty = 2, col = "green")
# geneGroups <- cutree(hc, k = NULL, h = 0.6)
geneGroups <- cutree(hc, k = 3)
To rank the mutations within one cluster, we reduce the distance matrix to the cluster mutations and rank them by their average distance to the other mutations in the cluster.
cluster1 <- names(geneGroups)[geneGroups == 1]
Mutations in cluster:
## [1] "chr6_29112175" "chr19_22316451" "chr10_47995404" "chr22_19361954"
## [5] "chr8_119619349" "chr19_9235515" "chr1_161323626" "chr13_77662146"
## [9] "chr20_34285508" "chr9_92046044" "chr11_19165102" "chr16_77294985"
## [13] "chr3_195720934" "chr1_146962102" "chr1_146966567" "chr12_9307187"
## [17] "chr1_117623122" "chr19_54278989" "chr14_105898065" "chr14_31175192"
## [21] "chr11_60181664" "chr19_22758275" "chr17_36195375" "chr22_18548383"
## [25] "chr12_11031362" "chr9_26997621" "chr1_85081375" "chr20_30399358"
## [29] "chr7_48615301" "chr1_148103850" "chr19_8907754" "chr1_146972854"
## [33] "chr19_44132133" "chr17_76089331" "chr19_44303411" "chr2_110656980"
## [37] "chr6_89083860" "chr4_186611335" "chr9_33541199" "chr14_64541317"
## [41] "chr17_39746793" "chr14_61727504" "chr7_142753027" "chr13_77662254"
## [45] "chr2_53895028" "chr1_146972960" "chr20_16430013" "chr19_20303351"
## [49] "chr13_25097025" "chr19_8896139" "chr3_119866607" "chr19_22757903"
## [53] "chr2_95935635"
Distances within cluster:
d1 <- d[cluster1, cluster1]
Average distance to other mutations in cluster:
colMeans(d1, na.rm = FALSE, dims = 1)
## chr6_29112175 chr19_22316451 chr10_47995404 chr22_19361954 chr8_119619349
## 0.5051887 0.5185321 0.5304627 0.4503991 0.3260170
## chr19_9235515 chr1_161323626 chr13_77662146 chr20_34285508 chr9_92046044
## 0.4979712 0.3831675 0.3311877 0.2622439 0.2336679
## chr11_19165102 chr16_77294985 chr3_195720934 chr1_146962102 chr1_146966567
## 0.2137038 0.2180288 0.3860986 0.4788863 0.4768925
## chr12_9307187 chr1_117623122 chr19_54278989 chr14_105898065 chr14_31175192
## 0.2581118 0.2456033 0.5104783 0.3537783 0.2138981
## chr11_60181664 chr19_22758275 chr17_36195375 chr22_18548383 chr12_11031362
## 0.2691816 0.2133297 0.4411344 0.5162217 0.5151892
## chr9_26997621 chr1_85081375 chr20_30399358 chr7_48615301 chr1_148103850
## 0.2907108 0.3043538 0.6179330 0.2310684 0.4781292
## chr19_8907754 chr1_146972854 chr19_44132133 chr17_76089331 chr19_44303411
## 0.2130759 0.2172278 0.2433382 0.2147557 0.3143189
## chr2_110656980 chr6_89083860 chr4_186611335 chr9_33541199 chr14_64541317
## 0.3100274 0.2574292 0.5136019 0.3205925 0.2134495
## chr17_39746793 chr14_61727504 chr7_142753027 chr13_77662254 chr2_53895028
## 0.2263325 0.5354693 0.2140854 0.3034458 0.4638321
## chr1_146972960 chr20_16430013 chr19_20303351 chr13_25097025 chr19_8896139
## 0.4761618 0.2169637 0.2739896 0.2967245 0.5142203
## chr3_119866607 chr19_22757903 chr2_95935635
## 0.3991698 0.2139939 0.5218410
Technically we should exclude the value on the main diagonal from the average. But it is zero at all positions which means when only ordering the mutations based on the average, it makes no difference.
Top separating mutations (first branch):
(top_names <- names(sort(colMeans(d1, na.rm = FALSE, dims = 1))))
## [1] "chr19_8907754" "chr19_22758275" "chr14_64541317" "chr11_19165102"
## [5] "chr14_31175192" "chr19_22757903" "chr7_142753027" "chr17_76089331"
## [9] "chr20_16430013" "chr1_146972854" "chr16_77294985" "chr17_39746793"
## [13] "chr7_48615301" "chr9_92046044" "chr19_44132133" "chr1_117623122"
## [17] "chr6_89083860" "chr12_9307187" "chr20_34285508" "chr11_60181664"
## [21] "chr19_20303351" "chr9_26997621" "chr13_25097025" "chr13_77662254"
## [25] "chr1_85081375" "chr2_110656980" "chr19_44303411" "chr9_33541199"
## [29] "chr8_119619349" "chr13_77662146" "chr14_105898065" "chr1_161323626"
## [33] "chr3_195720934" "chr3_119866607" "chr17_36195375" "chr22_19361954"
## [37] "chr2_53895028" "chr1_146972960" "chr1_146966567" "chr1_148103850"
## [41] "chr1_146962102" "chr19_9235515" "chr6_29112175" "chr19_54278989"
## [45] "chr4_186611335" "chr19_8896139" "chr12_11031362" "chr22_18548383"
## [49] "chr19_22316451" "chr2_95935635" "chr10_47995404" "chr14_61727504"
## [53] "chr20_30399358"
top_df <- as.data.frame(colMeans(d1[top_names]))
colnames(top_df) <- "avgDist"
top_df$variantName <- rownames(top_df)
top_muts_1 <- inner_join(top_df, coverage, by = "variantName")
# colnames(top_muts_1)[1] <- "mutation"
top_muts_1
## avgDist variantName covScore REF ALT relevant
## 1 0.2130759 chr19_8907754 0.60 C T NONE
## 2 0.2133297 chr19_22758275 0.48 G T MODERATE
## 3 0.2134495 chr14_64541317 0.36 C T NONE
## 4 0.2137038 chr11_19165102 0.44 G T NONE
## 5 0.2138981 chr14_31175192 0.44 C A NONE
## 6 0.2139939 chr19_22757903 0.40 A G MODERATE
## 7 0.2140854 chr7_142753027 0.56 C T NONE
## 8 0.2147557 chr17_76089331 0.40 G A NONE
## 9 0.2169637 chr20_16430013 0.48 G T NONE
## 10 0.2172278 chr1_146972854 0.32 G A NONE
## 11 0.2180288 chr16_77294985 0.40 G T MODERATE
## 12 0.2263325 chr17_39746793 0.28 C A MODERATE
## 13 0.2310684 chr7_48615301 0.44 G A MODERATE
## 14 0.2336679 chr9_92046044 0.32 G T MODERATE
## 15 0.2433382 chr19_44132133 0.48 A T MODERATE
## 16 0.2456033 chr1_117623122 0.40 G A MODERATE
## 17 0.2574292 chr6_89083860 0.28 C T NONE
## 18 0.2581118 chr12_9307187 0.44 G A NONE
## 19 0.2622439 chr20_34285508 0.36 T G MODERATE
## 20 0.2691816 chr11_60181664 0.32 A G MODERATE
## 21 0.2739896 chr19_20303351 0.20 A G NONE
## 22 0.2907108 chr9_26997621 0.40 A T NONE
## 23 0.2967245 chr13_25097025 0.44 T A MODERATE
## 24 0.3034458 chr13_77662254 0.44 G A NONE
## 25 0.3043538 chr1_85081375 0.40 C T MODERATE
## 26 0.3100274 chr2_110656980 0.28 A G NONE
## 27 0.3143189 chr19_44303411 0.44 C A MODERATE
## 28 0.3205925 chr9_33541199 0.56 A T MODERATE
## 29 0.3260170 chr8_119619349 0.36 A T NONE
## 30 0.3311877 chr13_77662146 0.36 C T NONE
## 31 0.3537783 chr14_105898065 0.44 T C NONE
## 32 0.3831675 chr1_161323626 0.36 T C NONE
## 33 0.3860986 chr3_195720934 0.68 T C NONE
## 34 0.3991698 chr3_119866607 0.56 A T MODERATE
## 35 0.4411344 chr17_36195375 0.32 A G MODERATE
## 36 0.4503991 chr22_19361954 0.28 A T NONE
## 37 0.4638321 chr2_53895028 0.40 T C NONE
## 38 0.4761618 chr1_146972960 0.32 A G MODERATE
## 39 0.4768925 chr1_146966567 0.28 G A NONE
## 40 0.4781292 chr1_148103850 0.64 T C NONE
## 41 0.4788863 chr1_146962102 0.28 G A NONE
## 42 0.4979712 chr19_9235515 0.40 G A NONE
## 43 0.5051887 chr6_29112175 0.48 C T NONE
## 44 0.5104783 chr19_54278989 0.44 C A MODERATE
## 45 0.5136019 chr4_186611335 0.40 T A NONE
## 46 0.5142203 chr19_8896139 0.64 T A NONE
## 47 0.5151892 chr12_11031362 0.56 T A NONE
## 48 0.5162217 chr22_18548383 0.24 G T NONE
## 49 0.5185321 chr19_22316451 0.48 C A MODERATE
## 50 0.5218410 chr2_95935635 0.68 T A MODERATE
## 51 0.5304627 chr10_47995404 0.40 C T MODERATE
## 52 0.5354693 chr14_61727504 0.28 C A MODERATE
## 53 0.6179330 chr20_30399358 0.76 A G NONE
print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE")))
## [1] "Number of mutations in cluster sandybrown with moderate functional impact: 22"
print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH")))
## [1] "Number of mutations in cluster sandybrown with high functional impact: 0"
cluster1 <- names(geneGroups)[geneGroups == 2]
Mutations in cluster:
## [1] "chr1_179813738" "chr19_8885072" "chr10_133625438" "chr14_105986963"
## [5] "chr19_22314268" "chr19_23222594" "chr19_8897572" "chr9_35148301"
## [9] "chr10_80504925" "chr16_3494694" "chr13_37033506" "chr3_58869346"
## [13] "chr18_68701696" "chr17_62522250" "chr2_203057333"
Distances within cluster:
d1 <- d[cluster1, cluster1]
Average distance to other mutations in cluster:
colMeans(d1, na.rm = FALSE, dims = 1)
## chr1_179813738 chr19_8885072 chr10_133625438 chr14_105986963 chr19_22314268
## 0.4547167 0.2638583 0.3746217 0.4574950 0.2741500
## chr19_23222594 chr19_8897572 chr9_35148301 chr10_80504925 chr16_3494694
## 0.2653167 0.2985500 0.3067283 0.4608167 0.5275150
## chr13_37033506 chr3_58869346 chr18_68701696 chr17_62522250 chr2_203057333
## 0.5487017 0.4596967 0.3918167 0.5101483 0.4004283
Technically we should exclude the value on the main diagonal from the average. But it is zero at all positions which means when only ordering the mutations based on the average, it makes no difference.
Top separating mutations (second branch):
(top_names <- names(sort(colMeans(d1, na.rm = FALSE, dims = 1))))
## [1] "chr19_8885072" "chr19_23222594" "chr19_22314268" "chr19_8897572"
## [5] "chr9_35148301" "chr10_133625438" "chr18_68701696" "chr2_203057333"
## [9] "chr1_179813738" "chr14_105986963" "chr3_58869346" "chr10_80504925"
## [13] "chr17_62522250" "chr16_3494694" "chr13_37033506"
top_df <- as.data.frame(colMeans(d1[top_names]))
colnames(top_df) <- "avgDist"
top_df$variantName <- rownames(top_df)
top_muts_2 <- inner_join(top_df, coverage, by = "variantName")
# colnames(top_muts_1)[1] <- "mutation"
top_muts_2
## avgDist variantName covScore REF ALT relevant
## 1 0.2638583 chr19_8885072 0.68 C T NONE
## 2 0.2653167 chr19_23222594 0.60 G T MODERATE
## 3 0.2741500 chr19_22314268 0.52 G T MODERATE
## 4 0.2985500 chr19_8897572 0.48 G C MODERATE
## 5 0.3067283 chr9_35148301 0.40 C T NONE
## 6 0.3746217 chr10_133625438 0.52 G T NONE
## 7 0.3918167 chr18_68701696 0.32 A T NONE
## 8 0.4004283 chr2_203057333 0.44 G A MODERATE
## 9 0.4547167 chr1_179813738 0.32 A G NONE
## 10 0.4574950 chr14_105986963 0.56 G A NONE
## 11 0.4596967 chr3_58869346 0.40 G A HIGH
## 12 0.4608167 chr10_80504925 0.28 A T NONE
## 13 0.5101483 chr17_62522250 0.32 C T MODERATE
## 14 0.5275150 chr16_3494694 0.20 C T MODERATE
## 15 0.5487017 chr13_37033506 0.36 C T MODERATE
print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE")))
## [1] "Number of mutations in cluster sandybrown with moderate functional impact: 7"
print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH")))
## [1] "Number of mutations in cluster sandybrown with high functional impact: 1"
cluster1 <- names(geneGroups)[geneGroups == 3]
Mutations in cluster:
## [1] "chr7_65399523" "chr7_64928509"
Distances within cluster:
d1 <- d[cluster1, cluster1]
Average distance to other mutations in cluster:
colMeans(d1, na.rm = FALSE, dims = 1)
## chr7_65399523 chr7_64928509
## 0.1761625 0.1761625
Technically we should exclude the value on the main diagonal from the average. But it is zero at all positions which means when only ordering the mutations based on the average, it makes no difference.
Top separating mutations (second branch):
(top_names <- names(sort(colMeans(d1, na.rm = FALSE, dims = 1))))
## [1] "chr7_65399523" "chr7_64928509"
top_df <- as.data.frame(colMeans(d1[top_names]))
colnames(top_df) <- "avgDist"
top_df$variantName <- rownames(top_df)
top_muts_2 <- inner_join(top_df, coverage, by = "variantName")
# colnames(top_muts_1)[1] <- "mutation"
top_muts_2
## avgDist variantName covScore REF ALT relevant
## 1 0.1761625 chr7_65399523 0.76 A T MODERATE
## 2 0.1761625 chr7_64928509 0.52 T G MODERATE
print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE")))
## [1] "Number of mutations in cluster sandybrown with moderate functional impact: 2"
print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH")))
## [1] "Number of mutations in cluster sandybrown with high functional impact: 0"